/*******************************************************************************
																				
	DESCRIPTION:  	This do file creates the multiple spells sample.
	
*******************************************************************************/

clear all
global id_code 134
set seed 2110

/*******************************************************************************
*	Merge and clean predictions for the JFR at 0M
********************************************************************************/

* Set macros:
global vars _
global model Full

* Load 2006 predictions:
use "${data}/003_MainWithEnsemblePred_${model}${vars}2006.dta", clear

* Keep only the 6M JFR at 0 and 6M
keep LopNr_PersonNr InLnr year n inSample ///
	emplAft6M_0M_In p_emplAft6M_0M_In ///
	emplAft6M_6M_In p_emplAft6M_6M_In ///
	startU trueEnd duration
	
	
gen DataYear = 2006 // DataYear indicates which year individuals comes from

* Append other years:
foreach year of numlist 1992/2005 2007/2016  {
	append using "${data}/003_MainWithEnsemblePred_${model}${vars}`year'.dta", ///
	keep(LopNr_PersonNr InLnr year n inSample ///
		emplAft6M_0M_In p_emplAft6M_0M_In ///
		emplAft6M_6M_In p_emplAft6M_6M_In ///
		startU trueEnd duration)
		
	replace DataYear = `year' if DataYear==.
}
		
compress
		
/* For spells that take place in at least two calendar years, there are often two
observations, as predictions are created separately for each year. Importantly,
each dependent variable is still only predicted once (in the year in which
the relevent month of unemployment is reached). Now we combine observations
so that there is only one observation per spell.*/
	
foreach months in 6 {
	foreach unempl in 0 6 {	
			
		sort LopNr_PersonNr InLnr DataYear
				
		by LopNr_PersonNr InLnr: egen temp0 = mean(p_emplAft`months'M_`unempl'M_In)
		by LopNr_PersonNr InLnr: egen temp1 = mean(emplAft`months'M_`unempl'M_In)

		drop p_emplAft`months'M_`unempl'M_In emplAft`months'M_`unempl'M_In
		rename temp0 p_emplAft`months'M_`unempl'M_In
		rename temp1 emplAft`months'M_`unempl'M_In

	}
}

drop DataYear
duplicates drop

* Finally, merge with the predictions of the 6M model for the 0M sample:
forval year = 1992/2016  {
	merge 1:1 LopNr_PersonNr InLnr using "${data}/003_MainWithEnsemblePred_${model}${vars}`year'_xMonthPred_yMonthModel.dta", ///
	keepusing(p_emplAft6M_0M_In_6M_Mod) update nogen
		
}

* Quick check that we have one observation per spell:
unique LopNr_PersonNr InLnr

* Drop observations with missing values in either predictions or outcomes:
drop if missing(emplAft6M_0M_In) | missing(p_emplAft6M_0M_In)

/*******************************************************************************
*	Build 2-spell sample:
********************************************************************************/

frame copy default twospell, replace

frame change twospell
	* Generate number of spell variable and drop people with only 1 spell:
	bysort LopNr_PersonNr: gen N_spells = _N

	tab N_spells

	* Generate histogram:
	histogram N_spells, ///
		frequency width(1) start(0.5) fcolor(ebblue*0.5) lcolor(ebblue) ///
		graphregion(color(white)) ///
		ytitle("Frequency", size(9pt)) ///
		xtitle("Number of Unemployment Spells (1992-2016)", size(9pt)) ///
		ylabel(0(200000)1200000, angle(0) format(%9.0fc)) yscale(titlegap(2)) ///
		xlabel(0(5)35) xmtick(0/36) xscale(titlegap(2) axis(1)) 					///
		name(distSpells, replace)

	drop if N_spells <= 1

	save "${data}/${id_code}_MultipleSpellSample.dta", replace
	
	
	* Randomly sort the data and pick the first two spells for all individuals:
	set seed 2110
	gen rand = runiform()

frame change default

* Match each multiple spell entry with a control entry from the same year (for this, 
* we randomly shuffle both datasets within year):
gen temp = runiform() // Generate random sorting
sort year temp
bysort year: gen random_id = _n

drop temp
rename (p_emplAft6M_0M_In emplAft6M_0M_In p_emplAft6M_0M_In_6M_Mod) (p_emplAft6M_0M_InC emplAft6M_0M_InC p_emplAft6M_0M_In_6M_ModC)


frame change twospell
	
	gen old_sort = _n

	gen temp = runiform()
	sort year temp
	bysort year: gen random_id = _n

	drop temp	
	frlink 1:1 year random_id, frame(default)
	frget p_emplAft6M_0M_InC emplAft6M_0M_InC p_emplAft6M_0M_In_6M_ModC, from(default)
	
	sort old_sort
	
	drop random_id default old_sort
	
	preserve
		drop rand
		save "${data}/${id_code}_MultipleSpellSample.dta", replace
	restore

	* Remove spells beyond 2:
	sort LopNr_PersonNr rand

	by LopNr_PersonNr: gen n_spell = _n
	keep if n_spell <= 2

	drop rand

	* Transform to wide format:
	reshape wide InLnr n year startU trueEnd duration emplAft6M_0M_In p_emplAft6M_0M_In emplAft6M_6M_In p_emplAft6M_6M_In p_emplAft6M_0M_In_6M_Mod p_emplAft6M_0M_InC emplAft6M_0M_InC p_emplAft6M_0M_In_6M_ModC, i(LopNr_PersonNr) ///
		j(n_spell)
		
	* Exclusions: remove individuals who have the two spells in the same year, as well
	* as individuals who have overlapping spells:
	gen yeardiff = year2 - year1
	gen exactdiff = startU2 - startU1
	gen exactdiff_end_start = startU2 - trueEnd1

	drop if (startU2 > startU1) & (startU2 <= trueEnd1) | (startU1 > startU2) & (startU1 <= trueEnd2)
	drop if year2 == year1


	* Copy for control sample:
	frame copy twospell control_0, replace

	* Save data:
	save "${data}/${id_code}_TwoSpellSample.dta", replace
	
frame change default


/*******************************************************************************
*	Diagnostics graphs
********************************************************************************/

frame change twospell

	* Generate histogram by year:
	twoway ///
		(histogram year1, ///
			frequency width(1) start(1991.5) fcolor(ebblue%40) lcolor(ebblue)) ///
		(histogram year2, ///
			frequency width(1) start(1991.5) fcolor(orange_red%40) lcolor(orange_red)), ///
		graphregion(color(white)) ///
		legend(order(1 "First Spell" 2 "Second Spell")) ///
		ytitle("Frequency", size(9pt)) ///
		xtitle("Year", size(9pt)) ///
		ylabel(0(20000)60000, angle(0) format(%9.0fc)) yscale(titlegap(2)) ///
		xlabel(1992(4)2016) xmtick(1992/2016) xscale(titlegap(2) axis(1)) 					///
		name(dist_year, replace)
		
	* Plot also distribution of the time difference:

	* From start to start, in years:
	twoway ///
		(histogram yeardiff, ///
			frequency width(1) start(-24.5) fcolor(ebblue*0.5) lcolor(ebblue)), ///
		graphregion(color(white)) ///
		ytitle("Frequency", size(9pt)) ///
		xtitle("Time difference between spells (years)", size(9pt)) ///
		ylabel(0(20000)100000, angle(0) format(%9.0fc)) yscale(titlegap(2)) ///
		xlabel(-24(4)24) xmtick(-24/24) xscale(titlegap(2) axis(1)) 					///
		name(dist_yeardiff, replace)
		
	gen abs_yeardiff = abs(yeardiff)

	twoway ///
		(histogram abs_yeardiff, ///
			frequency width(1) start(-0.5) fcolor(ebblue*0.5) lcolor(ebblue)), ///
		graphregion(color(white)) ///
		ytitle("Frequency", size(9pt)) ///
		xtitle("Absolute time difference between spells (years)", size(9pt)) ///
		ylabel(0(25000)175000, angle(0) format(%9.0fc)) yscale(titlegap(2)) ///
		xlabel(0(4)24) xmtick(0/24) xscale(titlegap(2) axis(1)) 					///
		name(dist_yeardiff, replace)
		
	graph export "${output}/${id_code}_TwoSpells_histogram_timeDiff_years.pdf", replace
		
	* From start to start, in days:
	twoway ///
		(histogram exactdiff, ///
			frequency /* width(1) start(-0.5) */ fcolor(ebblue*0.5) lcolor(ebblue)), ///
		graphregion(color(white)) ///
		ytitle("Frequency", size(9pt)) ///
		xtitle("Time difference between spells (days)", size(9pt)) ///
		ylabel(0(20000)80000, angle(0) format(%9.0fc)) yscale(titlegap(2)) ///
		xlabel() xmtick() xscale(titlegap(2) axis(1)) 					///
		name(dist_exactdiff, replace)

	* From end to start, in days:
	twoway ///
		(histogram exactdiff_end_start, ///
			frequency /* width(1) start(-0.5) */ fcolor(ebblue*0.5) lcolor(ebblue)), ///
		graphregion(color(white)) ///
		ytitle("Frequency", size(9pt)) ///
		xtitle("Time difference between spells (days)", size(9pt)) ///
		ylabel(0(20000)80000, angle(0) format(%9.0fc)) yscale(titlegap(2)) ///
		xlabel() xmtick() xscale(titlegap(2) axis(1)) 					///
		name(dist_exactdiff, replace)

frame change default